Hey Professor,
I had this done on time but forgot to turn it in. I usually turn it in during class on Tuesdays, as I have with all the other assignments, but this time I forgot. Could I get the late penalty waived? If not, that is fine. I understand.
Best, Ricky
Our goal in this assignment is to apply our logistic regression technique to some new samples:
Pulsars: Apply the technique to the pulsar dataset, but using only two variables (which you can pick). Note that although we used the Pulsar data to introduce Logistic Regression in the Logistic Regression notebook, we actually applied the regression method there to the University Admission dataset.
MNIST single digit: Apply the Logistic Regression technique to the MNIST dataset, using one digit (say 5) as the positive (y=1) class, and another digit (say 7) as the negative (y=0) class.
Extra: Try to do a simple version of a multi-class classification problem using MNIST: use 3 digits. Your primary output should be a confusion matrix. Hint:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
# Used to implement the multi-dimensional counter we need in the performance class
from collections import defaultdict
from functools import partial
from itertools import repeat
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
You will need the following:
def sigmoid(z):
    """Logistic function evaluated with a flipped sign convention.

    NOTE: this returns 1 / (1 + exp(+z)), i.e. the standard sigmoid of -z.
    Every caller in this notebook compensates by passing -X.Theta, so the
    composition is the usual logistic model h(X.Theta).
    """
    return 1.0 / (1.0 + np.exp(z))
def get_prob(Theta,Xp):
    """Return the model probability P(y=1 | Xp) for a single sample.

    Xp    : (1, n_features) row with the bias column included.
    Theta : (n_features, 1) parameter column vector.
    Returns a plain Python float (the single entry of the 1x1 result).
    """
    # sigmoid uses a flipped sign convention, so negate the linear score
    return sigmoid(-np.dot(Xp, Theta)).item(0)
def calc_cost_logistic(Theta,Xp,yp):
    """Total (unaveraged) cross-entropy cost of the logistic model.

    Theta : (n_features, 1) parameter column vector.
    Xp    : (m, n_features) design matrix with bias column.
    yp    : (m, 1) labels in {0, 1}.
    Returns the summed log-loss as a Python float.
    """
    hTheta = sigmoid(-np.dot(Xp, Theta))
    # -[ y.log(h) + (1-y).log(1-h) ], summed over samples via 1x1 products
    positive_term = np.dot(-yp.T, np.log(hTheta))
    negative_term = np.dot((1.0 - yp).T, np.log(1.0 - hTheta))
    # The difference is a 1x1 matrix - pull out the scalar and return it
    return (positive_term - negative_term).item(0)
def nested_defaultdict(default_factory, depth=1):
    """Build a defaultdict nested ``depth`` levels deep.

    Each level is a defaultdict whose missing-key factory creates the next
    level down; the innermost level uses ``default_factory``.  With
    depth=2 and default_factory=int this behaves as a 2-D counter:
    d[a][b] += 1 works without initialization.
    """
    factory = default_factory
    for _ in range(depth):
        factory = partial(defaultdict, factory)
    return factory()
def calc_gradient_descent_logistic(Theta,Xp,yp):
    """Gradient of the log-loss cost: X^T (h - y).

    Theta : (n_features, 1) parameters; Xp : (m, n_features); yp : (m, 1).
    Returns the (n_features, 1) gradient column vector.
    """
    residual = sigmoid(-np.dot(Xp, Theta)) - yp
    return np.dot(Xp.transpose(), residual)
def fit_data(Xp,yp,learningRate,max_iterations,scale=True,delta=0.001):
    """Fit logistic-regression parameters by batch gradient descent.

    Xp             : (m, n_features) design matrix (bias column included).
    yp             : (m, 1) labels in {0, 1}.
    learningRate   : gradient-descent step size.
    max_iterations : hard cap on the number of descent steps.
    scale          : unused; kept for backward compatibility with callers.
    delta          : convergence precision - stop once the cost drops by
                     less than this between successive iterations.

    Returns (Theta, iterations, costList): the fitted (n_features, 1)
    parameters, the number of steps taken, and the per-step cost history.
    """
    #
    # Get the initial values
    m,features = Xp.shape
    #
    # Set the starting theta values (all zeros).
    # BUG FIX: the original drew np.random.randn values here and then
    # immediately overwrote them with zeros - the dead random init is gone.
    Theta = np.zeros((features,1))
    print("Starting theta",Theta.shape)
    costList = []
    #
    # Calculate our initial cost
    cost = calc_cost_logistic(Theta,Xp,yp)
    # Seed the change above delta so the loop always runs at least once
    cost_change = delta+0.1
    iterations = 0
    #
    # In the while loop, "delta" is the precision.
    # BUG FIX: the loop previously tested the GLOBAL `iterations_max`
    # instead of the `max_iterations` parameter, which was silently ignored
    # (and would raise NameError if the global did not exist).
    while (iterations<max_iterations) and (cost_change>delta):
        last_cost = cost
        #
        # Update the theta parameters
        Theta = Theta - learningRate*calc_gradient_descent_logistic(Theta,Xp,yp)
        #
        # Calculate the cost
        cost = calc_cost_logistic(Theta,Xp,yp)
        cost_change = last_cost - cost
        #
        # Store the cost
        costList.append(cost)
        iterations += 1
    return Theta,iterations,costList
def plot_reg(X, y, beta):
    """Scatter the two classes in SCALED feature space and overlay the
    fitted linear decision boundary.

    X    : design matrix whose column 0 is the bias (ones) column; columns
           1 and 2 are the two scaled features that get plotted.
    y    : label array of 0/1 values; reshaped to 1-D for np.where.
    beta : fitted (3, 1) parameter column [intercept, theta1, theta2].

    Side effect: renders a plotly figure via fig.show(); returns nothing.
    """
    y = y.reshape(len(y))
    # Split the rows by class with a 0.5 threshold on the labels
    x_0 = X[np.where(y < 0.5)]
    x_1 = X[np.where(y > 0.5)]
    print("x_0",x_0.shape,x_1.shape)
    # Boundary x-range taken from the extremes of the positive-class rows.
    # NOTE(review): amax/amin run over ALL columns of x_1 (including the
    # ones column), not just feature 1 - confirm this is intended.
    x1_max = np.amax(x_1)
    x1_min = np.amin(x_1)
    print("x1 max,min",x1_max,x1_min)
    # plotting points with diff color for diff label
    fig = px.scatter(x=x_0[:, 1], y=x_0[:, 2],title="Scaled Decision Boundary Plot")
    fig2 = px.scatter(x=x_1[:, 1], y=x_1[:, 2])
    # plotting decision boundary: solve beta0 + beta1*x1 + beta2*x2 = 0 for x2
    x1 = np.arange(x1_min, x1_max, 0.1)
    #x1 = np.arange(0, 1, 0.1)
    x2 = -(beta[0,0] + beta[1,0]*x1)/beta[2,0]
    # uncomment these if you have 3 features
    #x3 = np.arange(0, 1, 0.1)
    #x2 = -(beta[0,0] + beta[1,0]*x1 + beta[3,0]*x3)/beta[2,0]
    print("x1",x1.shape,x2.shape)
    fig3= px.line(x=x1, y=x2)
    #
    # Without this, the colors of the two datasets are the same
    fig.data[0]['marker'].update(color='red')
    fig.data[0]['name']='Background'
    fig.data[0]['showlegend']=True
    fig2.data[0]['marker'].update(color='blue')
    fig2.data[0]['name']='Signal'
    fig2.data[0]['showlegend']=True
    fig3.data[0]['line'].update(color='goldenrod')
    fig3.data[0]['name']='Decision Boundary'
    fig3.data[0]['showlegend']=True
    #
    # This next line actually puts the two datasets on the same plot
    fig.add_trace(fig2.data[0])
    fig.add_trace(fig3.data[0])
    #
    # Now we plot them
    fig.show()
def plot_reg_scale(X, y, beta, scl):
    """Scatter the two classes in ORIGINAL (unscaled) feature units and
    overlay the decision boundary, mapped back through the scaler.

    X    : design matrix with the bias (ones) column first; columns 1+ are
           the scaled features that get inverse-transformed.
    y    : label array of 0/1 values; reshaped to 1-D for np.where.
    beta : fitted (3, 1) parameters in SCALED space.
    scl  : the fitted MinMaxScaler used to undo the feature scaling.

    Side effect: renders a plotly figure via fig.show(); returns nothing.
    """
    # Undo the scaling on the feature columns (drop the ones column first)
    Xt = scl.inverse_transform(X[:,1:])
    y = y.reshape(len(y))
    # Split the rows by class with a 0.5 threshold on the labels
    x_0 = Xt[np.where(y < 0.5)]
    x_1 = Xt[np.where(y > 0.5)]
    #
    # Inverse transform the points
    # x_0 = scl.inverse_transform(x_0[:,1:])
    # x_1 = scl.inverse_transform(x_1[:,1:])
    fig = px.scatter(x=x_0[:, 0], y=x_0[:, 1],title="Rescaled Decision Boundary Plot")
    fig2 = px.scatter(x=x_1[:, 0], y=x_1[:, 1])
    # plotting decision boundary: computed in SCALED space over [0, 1] ...
    x1 = np.linspace(0.0,1.0, 10)
    x2 = -(beta[0,0] + beta[1,0]*x1)/beta[2,0]
    #
    # Uncomment if you have 3 features
    #x3 = np.linspace(0.0,1.0, 10)
    #x2 = -(beta[0,0] + beta[1,0]*x1 + beta[3,0]*x3)/beta[2,0]
    # ... then the (x1, x2) line points are stacked into an (n, 2) array
    xline = np.append(x1.reshape(len(x1),1),x2.reshape(len(x2),1),axis=1)
    # Uncomment if you have 3 features
    #xline = np.append(xline,x2.reshape(len(x3),1),axis=1)
    # ... and mapped back to original units so the line overlays the points
    xline = scl.inverse_transform(xline)
    fig3= px.line(x=xline[:,0], y=xline[:,1])
    #
    # Without this, the colors of the two datasets are the same
    fig.data[0]['marker'].update(color='red')
    fig.data[0]['name']='Background'
    fig.data[0]['showlegend']=True
    fig2.data[0]['marker'].update(color='blue')
    fig2.data[0]['name']='Signal'
    fig2.data[0]['showlegend']=True
    fig3.data[0]['line'].update(color='goldenrod')
    fig3.data[0]['name']='Decision Boundary'
    fig3.data[0]['showlegend']=True
    #
    # This next line actually puts the two datasets on the same plot
    # print("fig2.data[0]",fig2.data[0])
    fig.add_trace(fig2.data[0])
    fig.add_trace(fig3.data[0])
    #
    # Now we plot them
    fig.show()
You will need to pick two variables to use as your features (since we are basing this on the example University Admission dataset). You will need to:
#
# Data location
data_location = '/fs/ess/PAS2038/PHYSICS5680_OSU/data'
#
# Read in the pulsar (HTRU2) dataset
fname = data_location + '/HTRU2/HTRU_2a.csv'
dfAll = pd.read_csv(fname)
# String copy of the integer class label (useful for discrete plotly colors)
dfAll['iclass'] = dfAll['class'].astype(str)
print(dfAll.head())
#
# # Make sure our dataset is balanced
# dfA = dfAll[dfAll['class']==1]
# dfB = dfAll[dfAll['class']==0]
Profile_mean Profile_stdev Profile_skewness Profile_kurtosis DM_mean \
0 140.562500 55.683782 -0.234571 -0.699648 3.199833
1 102.507812 58.882430 0.465318 -0.515088 1.677258
2 103.015625 39.341649 0.323328 1.051164 3.121237
3 136.750000 57.178449 -0.068415 -0.636238 3.642977
4 88.726562 40.672225 0.600866 1.123492 1.178930
DM_stdev DM_skewness DM_kurtosis class iclass
0 19.110426 7.975532 74.242225 0 0
1 14.860146 10.576487 127.393580 0 0
2 21.744669 7.735822 63.171909 0 0
3 20.959280 6.896499 53.593661 0 0
4 11.468720 14.269573 252.567306 0 0
# dfA_skew_mean = dfA[['Profile_skewness','Profile_mean']]
# print(dfA_skew_mean)
# dfB_skew_mean = dfB[['Profile_skewness','Profile_mean']]
# print(dfB_skew_mean)
#
# 70/30 train/test split with a fixed seed for reproducibility
scl = MinMaxScaler()
df_train,df_test = train_test_split(dfAll, test_size=0.3, random_state=42)
# Get the train data: the two chosen features and the binary class label
XToFit = df_train[['Profile_mean','Profile_skewness']].values
yToFit = df_train[['class']].values
#
# Make sure feature data is normalized (scaler is fit on TRAIN data only;
# the test cell below reuses it with transform)
XToFit2 = scl.fit_transform(XToFit)
#print("XToFit2",XToFit2.shape)
#XToFit2 = XToFit
print('Before Scaling')
print(XToFit)
print('After Scaling')
print(XToFit2)
print('shape')
print(yToFit.shape,XToFit.shape)
Before Scaling [[ 1.21156250e+02 3.75484665e-01] [ 7.69687500e+01 7.12897860e-01] [ 1.30585938e+02 1.33408289e-01] ... [ 1.16031250e+02 6.63455691e-01] [ 1.35664062e+02 -8.99403060e-02] [ 1.20726562e+02 3.46178079e-01]] After Scaling [[0.62713448 0.21549796] [0.38688302 0.24990139] [0.67840455 0.19081528] ... [0.59926939 0.24486016] [0.70601478 0.16804214] [0.62479823 0.21250979]] shape (12528, 1) (12528, 2)
# Prepend the "ones" column (bias/intercept term)
ones = np.ones((len(XToFit2),1))
XToFit2 = np.append(ones,XToFit2,axis=1)
#
# Make sure label data has the correct shape (m x 1 column vector)
yToFit2 = yToFit.reshape(len(yToFit),1)
#
# Check shapes
print(XToFit2.shape)
print(yToFit2.shape)
# Gradient-descent hyperparameters
iterations_max = 10000
learningRate = 0.001
delta = 0.01
Theta,iterations,costList = fit_data(XToFit2,yToFit2,learningRate,iterations_max,delta=delta)
#Theta,costList = fit_data_minimize(XToFit,yToFit,learningRate,iterations)
print("fit Theta ",Theta)
print("iterations ",iterations)
print("cost",costList[-1])
#
# Now apply to the TEST data
#
# Get the test data
XToFit = df_test[['Profile_mean','Profile_skewness']].values
yToFit = df_test[['class']].values
#
# Make sure feature data is normalized (transform only - reuse the scaler
# that was fit on the training data)
XToFit2 = scl.transform(XToFit)
print("XToFit2",XToFit2.shape)
#XToFit2 = XToFit
#
# Prepend the "ones" column (bias/intercept term)
ones = np.ones((len(XToFit2),1))
XToFit2 = np.append(ones,XToFit2,axis=1)
#
# Make sure label data has the correct shape (m x 1 column vector)
yToFit2 = yToFit.reshape(len(yToFit),1)
# Counters for positive-class bookkeeping (true positives / all positives)
countTrue = 0
foundTrue = 0
# 2-level counter: confusion_matrix_test[true_class][predicted_class]
confusion_matrix_test = nested_defaultdict(int,2)
m,features = XToFit2.shape
#
# Loop over our dataset again, and test each sample
for Xtest,yTest in zip(XToFit2,yToFit2):
    #
    # Make sure the features and label from this sample have the correct shape
    Xtest = Xtest.reshape(1,features)
    yTest = yTest.reshape(1,1)
    #
    # Get the probability for this sample
    thisProb = get_prob(Theta,Xtest)
    #
    # Count positives, and those the model also calls positive
    if yTest.item(0)==1:
        countTrue += 1
        if thisProb>0.5:
            foundTrue += 1
    #
    # Fill the confusion matrix (prediction thresholded at 0.5)
    trueClass = yTest.item(0)
    predClass = 0
    if thisProb>0.5:
        predClass = 1
    confusion_matrix_test[trueClass][predClass] += 1
print("Total True: ",countTrue,"; subset found as true: ",foundTrue)
# Convert the nested counter to a DataFrame: rows=true, columns=predicted
df = pd.DataFrame.from_dict(confusion_matrix_test,orient='index')
df.sort_index(axis=0,inplace=True)
df.sort_index(axis=1,inplace=True)
#
# Print the dataframe as a table to see what it looks like
from IPython.display import display
print("Rows are true classes, columns are predicted classes:")
display(df)
print("theta ",Theta)
#print("cost ",costList[:-10])
(12528, 3) (12528, 1) Starting theta (3, 1) fit Theta [[-10.8027648 ] [ -1.05653576] [ 34.74840478]] iterations 5121 cost 1055.4608483343309 XToFit2 (5370, 2) Total True: 486 ; subset found as true: 372 Rows are true classes, columns are predicted classes:
| 0 | 1 | |
|---|---|---|
| 0 | 4866 | 18 |
| 1 | 114 | 372 |
theta [[-10.8027648 ] [ -1.05653576] [ 34.74840478]]
import plotly.express as px
#
# Plot the training cost versus iteration number, then draw the
# decision-boundary plots in scaled and original feature units
fig = px.scatter(
    x=np.arange(len(costList)),
    y=costList,
    labels={'x': 'iteration', 'y': 'cost function'},
)
fig.show()
plot_reg(XToFit2, yToFit2, Theta)
plot_reg_scale(XToFit2, yToFit2, Theta, scl)
x_0 (4884, 3) (486, 3) x1 max,min 1.0 0.001996431908928724 x1 (10,) (10,)
Here we will want to bring in two digits, then split them into a train and a test sample.
Here you just want to:
NOTE: Make sure you check the number of iterations and the resulting confusion matrix. If there is evidence that your fit is not optimal, you may need to adjust both the learning rate and the delta (downward!).
I ran out of time.
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
%matplotlib inline
import pandas as pd
#
# Data location
data_location = '/fs/ess/PAS2038/PHYSICS5680_OSU/data'
#
# Optional "short_" prefix selects the reduced sample files
short = ""
#short = "short_"
#
# Read the two digit samples (5 = positive class, 7 = negative class) and
# stack them into a single frame, tagging each row with its digit label
dfCombined = pd.DataFrame()
for digit in (5, 7):
    print("Processing digit ",digit)
    fname = data_location + '/ch3/digit_' + short + str(digit) + '.csv'
    df = pd.read_csv(fname,header=None)
    df['digit'] = digit
    dfCombined = pd.concat([dfCombined, df])
print("Length of sample: ",len(dfCombined))
# Each MNIST image is a flattened 28x28 = 784 pixel row
num_features = 784
Processing digit 5 Processing digit 7 Length of sample: 13606
scl = MinMaxScaler()
# 70/30 train/test split with a fixed seed for reproducibility
df_train,df_test = train_test_split(dfCombined, test_size=0.3, random_state=42)
# Get the train data: the 784 pixel columns are the features, the appended
# 'digit' column (position num_features) is the label.
# BUG FIX: this was assigned to `XtoFit` (lower-case t) while the scaler
# below was fed the stale `XToFit` array left over from the pulsar section,
# causing the (9524,) vs (19048, 784) shape mismatch seen in the traceback.
XToFit = df_train.iloc[:,:num_features].values
yToFit = df_train.iloc[:,num_features].values
#
# Make sure feature data is normalized (scaler fit on TRAIN data only)
XToFit2 = scl.fit_transform(XToFit)
#print("XToFit2",XToFit2.shape)
#XToFit2 = XToFit
print('Before Scaling')
print(XToFit)
print('After Scaling')
print(XToFit2)
print('shape')
print(yToFit.shape,XToFit.shape)
print(df_train.iloc[:,:num_features].values.shape)
Before Scaling [[0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0] ... [0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0]] After Scaling [[0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] ... [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.]] shape (9524,) (19048, 784) (9524, 784)
df_train
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | digit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3827 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 5983 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
| 5466 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 1247 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 5796 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5191 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 7105 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
| 5390 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 860 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 957 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
9524 rows × 785 columns
# Prepend the "ones" column (bias/intercept term)
ones = np.ones((len(XToFit2),1))
XToFit2 = np.append(ones,XToFit2,axis=1)
#
# Make sure label data has the correct shape AND the correct values.
# BUG FIX: the raw labels are the digits themselves (5 and 7), but the
# logistic cost assumes y in {0, 1}.  Per the assignment, map digit 5 -> 1
# (positive class) and digit 7 -> 0 (negative class) before fitting.
yToFit2 = (yToFit == 5).astype(float).reshape(len(yToFit),1)
#
# Check shapes
print(XToFit2.shape)
print(yToFit2.shape)
# Gradient-descent hyperparameters
iterations_max = 10000
learningRate = 0.001
delta = 0.01
Theta,iterations,costList = fit_data(XToFit2,yToFit2,learningRate,iterations_max,delta=delta)
#Theta,costList = fit_data_minimize(XToFit,yToFit,learningRate,iterations)
print("fit Theta ",Theta)
print("iterations ",iterations)
print("cost",costList[-1])
#
# Now apply to the TEST data
#
# Get the test data.
# BUG FIX: the original selected rows with df_test[df_test['digit' == 5]];
# the string comparison 'digit' == 5 evaluates to False BEFORE indexing, so
# this raised a KeyError.  It also concatenated the 5s and 7s while taking
# the labels from df_test in its ORIGINAL row order, which would have
# misaligned features and labels.  Taking both directly from df_test keeps
# every row's features and label together.
XToFit = df_test.iloc[:,:num_features].values
# Binary labels: digit 5 -> 1 (positive class), digit 7 -> 0 (negative)
yToFit = (df_test['digit'] == 5).astype(int).values
#
# Make sure feature data is normalized (reuse the train-fitted scaler)
XToFit2 = scl.transform(XToFit)
print("XToFit2",XToFit2.shape)
#XToFit2 = XToFit
#
# Prepend the "ones" column (bias/intercept term)
ones = np.ones((len(XToFit2),1))
XToFit2 = np.append(ones,XToFit2,axis=1)
#
# Make sure label data has the correct shape (m x 1 column vector)
yToFit2 = yToFit.reshape(len(yToFit),1)
# Counters for positive-class bookkeeping (true positives / all positives)
countTrue = 0
foundTrue = 0
# 2-level counter: confusion_matrix_test[true_class][predicted_class]
confusion_matrix_test = nested_defaultdict(int,2)
m,features = XToFit2.shape
#
# Loop over our dataset again, and test each sample
for Xtest,yTest in zip(XToFit2,yToFit2):
    #
    # Make sure the features and label from this sample have the correct shape
    Xtest = Xtest.reshape(1,features)
    yTest = yTest.reshape(1,1)
    #
    # Get the probability for this sample
    thisProb = get_prob(Theta,Xtest)
    #
    # Count positives, and those the model also calls positive
    if yTest.item(0)==1:
        countTrue += 1
        if thisProb>0.5:
            foundTrue += 1
    #
    # Fill the confusion matrix (prediction thresholded at 0.5)
    trueClass = yTest.item(0)
    predClass = 0
    if thisProb>0.5:
        predClass = 1
    confusion_matrix_test[trueClass][predClass] += 1
print("Total True: ",countTrue,"; subset found as true: ",foundTrue)
# Convert the nested counter to a DataFrame: rows=true, columns=predicted
df = pd.DataFrame.from_dict(confusion_matrix_test,orient='index')
df.sort_index(axis=0,inplace=True)
df.sort_index(axis=1,inplace=True)
#
# Print the dataframe as a table to see what it looks like
from IPython.display import display
print("Rows are true classes, columns are predicted classes:")
display(df)
print("theta ",Theta)
#print("cost ",costList[:-10])
(19048, 785) (9524, 1) Starting theta (785, 1)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-8-02f2c2dbf2bb> in <module> 13 learningRate = 0.001 14 delta = 0.01 ---> 15 Theta,iterations,costList = fit_data(XToFit2,yToFit2,learningRate,iterations_max,delta=delta) 16 #Theta,costList = fit_data_minimize(XToFit,yToFit,learningRate,iterations) 17 print("fit Theta ",Theta) <ipython-input-3-bbd18d09885a> in fit_data(Xp, yp, learningRate, max_iterations, scale, delta) 17 # 18 # Calculate our initial cost ---> 19 cost = calc_cost_logistic(Theta,Xp,yp) 20 cost_change = delta+0.1 21 iterations = 0 <ipython-input-2-fd609829a42a> in calc_cost_logistic(Theta, Xp, yp) 9 def calc_cost_logistic(Theta,Xp,yp): 10 hTheta = sigmoid(-np.dot(Xp,Theta)) ---> 11 cost = np.dot(-yp.T,np.log(hTheta)) - np.dot((1.0 -yp).T,np.log(1.0-hTheta)) 12 # 13 # Cost above is a 1x1 matrix - pull out the single value and return it <__array_function__ internals> in dot(*args, **kwargs) ValueError: shapes (1,9524) and (19048,1) not aligned: 9524 (dim 1) != 19048 (dim 0)
(13606, 784)
Try to do a simple version of a multi-class classification problem using MNIST: use 3 digits (example: 5,6,7). Your primary output should be a confusion matrix.
Hint:
The only output expected is a confusion matrix. Ask if you have questions!